In [1]:
from em_utilities import *   # provides np, normalize, diag, deepcopy, EM_for_high_dimension, logpdf_diagonal_gaussian
import sframe as sf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
import scipy
import time

Section 0:

Dataset definition and feature extraction (tf-idf)


In [2]:
dataset = sf.SFrame('Dataset/KO_data.csv')
dataset.remove_column('X1')          # drop the original index column
dataset = dataset.add_row_number()   # re-number rows from 0
dataset.rename({'id': 'X1'})         # restore the 'X1' column name


[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1504805937.log
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file /home/abdl-rahman/Desktop/Recommendation systems/EM for clustering/Dataset/KO_data.csv
Parsing completed. Parsed 1423 lines in 0.318258 secs.
Out[2]:
X1  file_name                                  category     text
0   training-dataset/engineering/912.txt ...   engineering  Uber s case for incremental processin ...
1   training-dataset/business/747.txt ...      business     On the Road to Recap Why the Unicorn Financing ...
2   training-dataset/product/919.txt ...       product      How designers can use data to create amazing ...
3   training-dataset/business/222.txt ...      business     The Arc of Company Life and How to Prolong ItOn ...
4   training-dataset/business/238.txt ...      business     Advice to Grads Join A Winning Startup v 2016 ...
5   training-dataset/product/297.txt ...       product      GV Guide to Design Critique GV LibraryGV ...
6   training-dataset/product/1281.txt ...      product      Beating designer s blockThose hours or days ...
7   training-dataset/product/310.txt ...       product      How to create effective push notificationsOver ...
8   training-dataset/product/160.txt ...       product      Thoughtbot s Kyle Fiedler Know yourself and trust ...
9   training-dataset/product/92.txt ...        product      A product team s friend or foe Feature Req ...
[1423 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [3]:
tfidfvec = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tfidfvec.fit_transform(dataset['text'])
tf_idf_matrix = normalize(tf_idf_matrix)  # L2-normalize each document vector

Section 1:

Model parameters: smart initialization

A K-means++ model is used to initialize the parameters of the EM model:

  • K-means++ initializes the means (the cluster centroids); the weights and diagonal covariances are then derived from its hard cluster assignments.

In [4]:
# Smart initialization of the means using a KMeans++ model
def initialize_means(num_clusters, features_matrix):
    from sklearn.cluster import KMeans
    np.random.seed(5)
    kmeans_model = KMeans(n_clusters=num_clusters, init='k-means++', n_init=5, max_iter=400, random_state=1, n_jobs=1)
    kmeans_model.fit(features_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_
    means = list(centroids)   # one mean vector per cluster
    return [means, cluster_assignment]

In [5]:
# Smart initialization of the weights: fraction of documents per cluster
def initialize_weights(num_clusters, features_matrix, cluster_assignment):
    num_docs = features_matrix.shape[0]
    weights = []
    for i in xrange(num_clusters):
        num_assigned = len(cluster_assignment[cluster_assignment == i])
        w = float(num_assigned) / num_docs
        weights.append(w)
    return weights
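
The weight computed above is just the fraction of documents that K-means++ assigned to cluster k:

$$\hat{w}_k = \frac{N_k}{N}, \qquad N_k = \bigl|\{\, i : z_i = k \,\}\bigr|$$

where z_i is the hard cluster assignment of document i and N is the total number of documents.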

In [6]:
# Smart initialization of the (diagonal) covariances
def initialize_covs(num_clusters, features_matrix, cluster_assignment, means):
    covs = []
    for i in xrange(num_clusters):
        member_rows = features_matrix[cluster_assignment == i]
        # Per-dimension variance, computed sparsely as E[x^2] - 2*mu*E[x] + mu^2
        cov = (member_rows.multiply(member_rows) - 2*member_rows.dot(diag(means[i]))).sum(axis=0).A1 / member_rows.shape[0] \
              + means[i]**2
        cov[cov < 1e-8] = 1e-8   # floor tiny variances to keep them positive
        covs.append(cov)
    return covs
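
The one-liner above computes each cluster's per-dimension variance without densifying the sparse tf-idf matrix, by expanding the squared deviation:

$$\hat{\sigma}_{kj}^{2} = \frac{1}{N_k}\sum_{i:\,z_i=k}\bigl(x_{ij}-\mu_{kj}\bigr)^{2} = \frac{1}{N_k}\sum_{i:\,z_i=k}\bigl(x_{ij}^{2}-2\,\mu_{kj}\,x_{ij}\bigr) + \mu_{kj}^{2}$$

The 1e-8 floor keeps every variance strictly positive so the diagonal Gaussians remain well defined.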

Section 2:

Training models with different numbers of clusters

The parameters of each model are initialized as above; training then proceeds with the Expectation-Maximization (EM) algorithm.


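For reference, EM fits a mixture of diagonal-covariance Gaussians by locally maximizing the log-likelihood

$$\ell(w,\mu,\sigma) = \sum_{i=1}^{N}\log\sum_{k=1}^{K} w_k\,\mathcal{N}\!\bigl(x_i \mid \mu_k,\ \mathrm{diag}(\sigma_k^2)\bigr)$$

alternating between computing responsibilities (E-step) and re-estimating the weights, means, and variances (M-step).
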
In [7]:
# Model 1 with 10 clusters
(means, cluster_assignment_10model) = initialize_means(10, tf_idf_matrix)
covs = initialize_covs(10, tf_idf_matrix, cluster_assignment_10model, means)
weights = initialize_weights(10, tf_idf_matrix, cluster_assignment_10model)
model_em_10k = EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

In [8]:
# Model 2 with 20 clusters.
(means, cluster_assignment_20model) = initialize_means(20, tf_idf_matrix)
covs = initialize_covs(20, tf_idf_matrix, cluster_assignment_20model, means)
weights = initialize_weights(20, tf_idf_matrix, cluster_assignment_20model)
model_em_20k = EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

Section 3:

Evaluation report for each cluster (interpreting clusters)

The evaluation report has two parts: the first lists the top words representing each cluster, which is what really makes a cluster interpretable; the second counts the article categories within each cluster, showing how mixed the article types are.


In [9]:
def visualize_EM_clusters(tf_idf, means, covs, map_index_to_word):
    print('')
    print('==========================================================')

    num_clusters = len(means)
    for c in xrange(num_clusters):
        print('Cluster {0:d}: Largest mean parameters in cluster '.format(c))
        print('\n{0: <12}{1: <12}{2: <12}'.format('Word', 'Mean', 'Variance'))
        
        # Indices of the words sorted by decreasing cluster mean
        sorted_word_ids = np.argsort(means[c])[::-1]

        for i in sorted_word_ids[:10]:
            print '{0: <12}{1:<10.2e}{2:10.2e}'.format(map_index_to_word[i], 
                                                       means[c][i],
                                                       covs[c][i])
        print '\n=========================================================='

In [10]:
def clusters_report(clusters_idx):
    cluster_id = 0
    for cluster_indices in clusters_idx:
        countP = 0
        countB = 0
        countE = 0
        for i in cluster_indices:
            if dataset['category'][i] == 'product':
                countP += 1
            elif dataset['category'][i] == 'engineering':
                countE += 1
            elif dataset['category'][i] == 'business':
                countB += 1
        print "Cluster ", cluster_id, "\n==========================\n"
        cluster_id += 1
        print "product count : ", countP, "\nengineering count : ", countE, "\nbusiness count : ", countB, "\n"

In [11]:
visualize_EM_clusters(tf_idf_matrix, model_em_10k['means'], model_em_10k['covs'], tfidfvec.get_feature_names())


==========================================================
Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
company     5.94e-02    3.71e-03
team        5.82e-02    5.92e-03
people      5.53e-02    3.06e-03
startup     4.38e-02    4.79e-03
time        3.97e-02    8.44e-04
work        3.75e-02    2.13e-03
business    3.51e-02    2.63e-03
product     3.15e-02    1.48e-03
don         3.06e-02    1.23e-03
companies   3.04e-02    1.94e-03

==========================================================
Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
users       9.49e-02    8.42e-03
user        8.22e-02    7.33e-03
app         5.62e-02    1.19e-02
design      5.55e-02    3.78e-03
product     5.03e-02    3.25e-03
onboarding  4.32e-02    1.66e-02
mobile      4.22e-02    1.05e-02
use         3.58e-02    8.12e-04
content     3.37e-02    5.02e-03
people      3.20e-02    2.25e-03

==========================================================
Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
data        8.54e-02    1.29e-02
microservices 6.97e-02   2.55e-02
serverless  4.64e-02    2.18e-02
code        4.38e-02    4.24e-03
services    4.33e-02    4.63e-03
service     4.30e-02    4.96e-03
database    3.67e-02    7.93e-03
architecture 3.30e-02    2.35e-03
application 3.06e-02    1.76e-03
server      2.88e-02    2.92e-03

==========================================================
Cluster 3: Largest mean parameters in cluster 

Word        Mean        Variance    
xero        3.87e-01    3.82e-06
vet         2.43e-01    1.51e-06
institute   2.17e-01    1.20e-06
codigodelsur 1.98e-01    1.00e-06
tech        1.78e-01    8.05e-07
veteran     1.66e-01    7.02e-07
accounting  1.55e-01    6.14e-07
firms       1.38e-01    4.88e-07
founder     1.30e-01    4.33e-07
highly      1.01e-01    2.59e-07

==========================================================
Cluster 4: Largest mean parameters in cluster 

Word        Mean        Variance    
netflix     6.53e-01    4.43e-03
blog        2.52e-01    7.47e-03
technology  1.76e-01    4.55e-03
regarding   1.62e-01    3.88e-03
perspectives 1.59e-01    3.70e-03
tech        1.36e-01    3.53e-03
issues      1.02e-01    1.13e-03
challenges  1.00e-01    1.47e-03
focused     9.90e-02    1.44e-03
decisions   9.52e-02    1.33e-03

==========================================================
Cluster 5: Largest mean parameters in cluster 

Word        Mean        Variance    
rails       1.37e-01    5.11e-02
ruby        6.74e-02    1.20e-02
phoenix     5.48e-02    2.14e-02
elixir      5.14e-02    1.93e-02
language    3.78e-02    8.65e-03
akka        3.52e-02    1.34e-02
data        3.50e-02    3.00e-03
combinator  3.45e-02    3.33e-02
redirecting 3.45e-02    3.33e-02
quip        3.45e-02    3.33e-02

==========================================================
Cluster 6: Largest mean parameters in cluster 

Word        Mean        Variance    
people      2.38e-02    1.17e-03
product     2.19e-02    1.27e-03
like        1.91e-02    4.32e-04
new         1.79e-02    5.90e-04
companies   1.74e-02    1.13e-03
time        1.65e-02    4.72e-04
learning    1.64e-02    3.60e-03
value       1.60e-02    2.68e-03
facebook    1.59e-02    3.07e-03
business    1.57e-02    1.06e-03

==========================================================
Cluster 7: Largest mean parameters in cluster 

Word        Mean        Variance    
design      2.47e-01    1.52e-02
designers   1.07e-01    1.35e-02
team        7.60e-02    6.50e-03
sprint      6.79e-02    2.73e-02
product     5.76e-02    3.48e-03
work        5.57e-02    2.97e-03
designer    5.27e-02    2.61e-03
project     4.04e-02    4.25e-03
people      3.75e-02    1.56e-03
sprints     3.74e-02    9.59e-03

==========================================================
Cluster 8: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.44e-01    9.90e-03
customer    5.83e-02    6.51e-03
team        5.78e-02    4.46e-03
customers   5.65e-02    5.00e-03
manager     4.77e-02    5.05e-03
management  4.02e-02    4.19e-03
managers    3.89e-02    3.50e-03
market      3.73e-02    5.22e-03
products    3.62e-02    1.62e-03
people      3.38e-02    1.53e-03

==========================================================
Cluster 9: Largest mean parameters in cluster 

Word        Mean        Variance    
ux          2.04e-01    4.35e-02
design      1.42e-01    9.81e-03
designer    1.22e-01    1.99e-02
newslog     5.16e-02    2.23e-02
product     4.86e-02    4.48e-03
user        4.23e-02    2.88e-03
experience  3.84e-02    2.32e-03
designers   3.67e-02    1.69e-03
meets       3.58e-02    1.05e-02
ideas       3.56e-02    8.95e-03

==========================================================

In [12]:
visualize_EM_clusters(tf_idf_matrix, model_em_20k['means'], model_em_20k['covs'], tfidfvec.get_feature_names())


==========================================================
Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
design      2.93e-01    1.06e-02
designers   8.87e-02    1.02e-02
sprint      8.35e-02    3.31e-02
designer    7.68e-02    1.20e-02
team        6.71e-02    5.77e-03
product     5.61e-02    3.87e-03
sprints     4.57e-02    1.17e-02
work        4.15e-02    1.68e-03
process     3.70e-02    1.78e-03
people      3.29e-02    1.34e-03

==========================================================
Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
sketch      5.86e-02    2.19e-02
javascript  5.79e-02    1.60e-02
react       4.94e-02    1.69e-02
page        4.61e-02    5.32e-03
font        4.50e-02    1.87e-02
user        4.08e-02    2.93e-03
code        4.04e-02    2.74e-03
ember       3.48e-02    1.51e-02
fonts       3.42e-02    1.00e-02
like        3.27e-02    5.69e-04

==========================================================
Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
product     1.71e-01    7.63e-03
customer    1.04e-01    1.05e-02
customers   9.78e-02    7.32e-03
users       6.04e-02    6.33e-03
user        4.96e-02    4.58e-03
market      4.83e-02    7.52e-03
value       4.51e-02    6.08e-03
onboarding  4.15e-02    1.43e-02
marketing   4.06e-02    5.49e-03
pricing     3.86e-02    1.70e-02

==========================================================
Cluster 3: Largest mean parameters in cluster 

Word        Mean        Variance    
people      5.99e-02    3.65e-03
time        3.59e-02    1.10e-03
like        3.37e-02    9.41e-04
work        3.36e-02    2.79e-03
product     3.16e-02    1.69e-03
ve          2.99e-02    1.22e-03
things      2.88e-02    1.22e-03
don         2.87e-02    1.17e-03
just        2.75e-02    7.10e-04
company     2.74e-02    1.51e-03

==========================================================
Cluster 4: Largest mean parameters in cluster 

Word        Mean        Variance    
netflix     6.53e-01    4.43e-03
blog        2.52e-01    7.47e-03
technology  1.76e-01    4.55e-03
regarding   1.62e-01    3.88e-03
perspectives 1.59e-01    3.70e-03
tech        1.36e-01    3.53e-03
issues      1.02e-01    1.13e-03
challenges  1.00e-01    1.47e-03
focused     9.90e-02    1.44e-03
decisions   9.52e-02    1.33e-03

==========================================================
Cluster 5: Largest mean parameters in cluster 

Word        Mean        Variance    
app         8.27e-02    1.70e-02
mobile      7.09e-02    1.57e-02
platform    5.48e-02    1.22e-02
users       5.33e-02    4.81e-03
facebook    5.24e-02    1.03e-02
apps        4.91e-02    5.26e-03
platforms   4.29e-02    8.52e-03
video       4.01e-02    1.16e-02
user        3.98e-02    2.75e-03
people      3.72e-02    1.88e-03

==========================================================
Cluster 6: Largest mean parameters in cluster 

Word        Mean        Variance    
xero        3.87e-01    3.82e-06
vet         2.43e-01    1.51e-06
institute   2.17e-01    1.20e-06
codigodelsur 1.98e-01    1.00e-06
tech        1.78e-01    8.05e-07
veteran     1.66e-01    7.02e-07
accounting  1.55e-01    6.14e-07
firms       1.38e-01    4.88e-07
founder     1.30e-01    4.33e-07
highly      1.01e-01    2.59e-07

==========================================================
Cluster 7: Largest mean parameters in cluster 

Word        Mean        Variance    
serverless  4.78e-01    2.55e-02
faas        3.02e-01    1.01e-01
processing  1.55e-01    3.39e-02
task        9.23e-02    1.38e-02
functions   9.18e-02    3.01e-03
lambda      9.13e-02    6.25e-03
application 8.14e-02    3.03e-03
medium      7.89e-02    7.08e-02
paas        6.89e-02    5.53e-03
gateway     6.84e-02    5.04e-03

==========================================================
Cluster 8: Largest mean parameters in cluster 

Word        Mean        Variance    
startup     1.90e-01    7.94e-03
company     9.08e-02    6.92e-03
founders    8.96e-02    7.81e-03
founder     6.73e-02    6.78e-03
startups    6.46e-02    4.41e-03
business    6.22e-02    3.93e-03
product     5.70e-02    3.12e-03
people      5.34e-02    1.70e-03
growth      5.01e-02    1.01e-02
idea        4.96e-02    5.47e-03

==========================================================
Cluster 9: Largest mean parameters in cluster 

Word        Mean        Variance    
memory      1.97e-01    4.25e-02
virtual     4.67e-02    8.59e-03
file        4.59e-02    1.71e-02
cpu         3.86e-02    4.23e-03
disk        3.82e-02    5.06e-03
data        3.77e-02    1.41e-03
linux       3.73e-02    5.01e-03
code        3.59e-02    4.63e-03
ring        3.58e-02    1.56e-02
postgres    3.55e-02    1.93e-02

==========================================================
Cluster 10: Largest mean parameters in cluster 

Word        Mean        Variance    
ux          1.18e-01    3.41e-02
design      1.04e-01    5.87e-03
project     9.64e-02    1.74e-02
user        6.72e-02    8.70e-03
designers   5.59e-02    9.39e-03
users       5.17e-02    6.04e-03
designer    4.94e-02    3.67e-03
client      4.93e-02    9.98e-03
research    4.91e-02    8.98e-03
product     4.45e-02    3.46e-03

==========================================================
Cluster 11: Largest mean parameters in cluster 

Word        Mean        Variance    
data        2.05e-01    1.91e-02
database    5.26e-02    1.34e-02
kafka       4.14e-02    1.06e-02
use         2.97e-02    6.30e-04
user        2.88e-02    1.66e-03
metrics     2.71e-02    2.52e-03
analytics   2.70e-02    6.32e-03
schema      2.65e-02    3.64e-03
conversion  2.42e-02    7.70e-03
using       2.41e-02    5.14e-04

==========================================================
Cluster 12: Largest mean parameters in cluster 

Word        Mean        Variance    
trump       3.07e-01    3.17e-02
vcs         1.12e-01    5.56e-02
combinator  8.80e-02    6.97e-02
startup     7.11e-02    4.47e-02
hillary     6.86e-02    1.65e-02
lps         5.63e-02    3.80e-02
start       5.40e-02    2.50e-02
jonah       4.97e-02    2.97e-02
country     4.75e-02    5.00e-03
election    4.41e-02    1.43e-03

==========================================================
Cluster 13: Largest mean parameters in cluster 

Word        Mean        Variance    
investors   8.78e-02    9.17e-03
capital     6.44e-02    6.17e-03
company     6.04e-02    3.48e-03
companies   5.58e-02    3.84e-03
founders    5.50e-02    5.33e-03
market      5.19e-02    4.69e-03
business    4.94e-02    4.91e-03
startup     4.69e-02    2.87e-03
money       4.56e-02    3.50e-03
equity      4.48e-02    1.30e-02

==========================================================
Cluster 14: Largest mean parameters in cluster 

Word        Mean        Variance    
learning    1.30e-01    1.90e-02
machine     1.16e-01    1.53e-02
neural      5.77e-02    1.43e-02
data        5.42e-02    6.61e-03
deep        5.37e-02    5.25e-03
ai          4.53e-02    1.43e-02
intelligence 4.53e-02    1.26e-02
computers   3.40e-02    6.70e-03
quantum     3.31e-02    1.99e-02
et          3.18e-02    1.40e-02

==========================================================
Cluster 15: Largest mean parameters in cluster 

Word        Mean        Variance    
microservices 4.84e-02   1.92e-02
code        4.22e-02    6.33e-03
service     3.37e-02    4.40e-03
services    3.25e-02    3.91e-03
uber        2.99e-02    1.05e-02
data        2.40e-02    1.23e-03
new         2.06e-02    6.57e-04
architecture 1.86e-02    1.52e-03
infrastructure 1.85e-02  2.37e-03
time        1.82e-02    4.06e-04

==========================================================
Cluster 16: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.98e-01    1.15e-02
manager     8.90e-02    7.62e-03
management  7.96e-02    6.60e-03
managers    6.72e-02    5.66e-03
pm          5.06e-02    1.78e-02
team        4.99e-02    2.03e-03
development 4.44e-02    5.82e-03
products    3.66e-02    1.68e-03
role        3.36e-02    3.06e-03
people      2.84e-02    9.51e-04

==========================================================
Cluster 17: Largest mean parameters in cluster 

Word        Mean        Variance    
culture     1.66e-01    4.50e-02
teams       1.42e-01    2.06e-02
team        1.26e-01    1.12e-02
innovation  8.65e-02    2.56e-02
employees   8.13e-02    1.25e-02
company     6.95e-02    5.01e-03
people      5.61e-02    1.60e-03
work        4.80e-02    1.05e-03
ideas       4.79e-02    4.39e-03
innovative  4.39e-02    9.51e-03

==========================================================
Cluster 18: Largest mean parameters in cluster 

Word        Mean        Variance    
content     2.23e-01    2.19e-02
seo         6.51e-02    2.64e-02
blog        4.60e-02    8.35e-03
page        4.60e-02    4.04e-03
search      4.01e-02    1.16e-02
product     3.84e-02    3.18e-03
readers     3.56e-02    5.51e-03
notifications 3.44e-02   2.23e-02
reader      3.06e-02    4.38e-03
gestures    3.00e-02    1.37e-02

==========================================================
Cluster 19: Largest mean parameters in cluster 

Word        Mean        Variance    
team        1.28e-01    8.92e-03
product     8.06e-02    5.43e-03
people      5.19e-02    2.33e-03
company     4.93e-02    2.57e-03
teams       4.72e-02    3.51e-03
time        4.32e-02    1.15e-03
work        3.97e-02    1.15e-03
says        3.90e-02    4.63e-03
new         3.03e-02    9.49e-04
goals       2.72e-02    6.12e-03

==========================================================

In [13]:
# Number of articles in each cluster for the first model (10 clusters)
resps_10k = sf.SFrame(model_em_10k['resp'])
resps_10k = resps_10k.unpack('X1', '')
cluster_id = 0
cluster_hash_10model = {}
for col in resps_10k.column_names():
    cluster_10k = np.array(resps_10k[col])
    print "cluster ", cluster_id, "assignments: ", cluster_10k.sum()
    cluster_hash_10model[cluster_id] = cluster_10k.nonzero()   # member article indices
    cluster_id += 1


cluster  0 assignments:  359.0
cluster  1 assignments:  135.0
cluster  2 assignments:  139.0
cluster  3 assignments:  11.0
cluster  4 assignments:  26.0
cluster  5 assignments:  29.0
cluster  6 assignments:  365.0
cluster  7 assignments:  92.0
cluster  8 assignments:  230.0
cluster  9 assignments:  37.0

In [14]:
# Number of articles in each cluster for the second model (20 clusters)
resps_20k = sf.SFrame(model_em_20k['resp'])
resps_20k = resps_20k.unpack('X1', '')
cluster_id = 0
cluster_hash_20model = {}
for col in resps_20k.column_names():
    cluster_20k = np.array(resps_20k[col])
    print "cluster ", cluster_id, "assignments: ", cluster_20k.sum()
    cluster_hash_20model[cluster_id] = cluster_20k.nonzero()   # member article indices
    cluster_id += 1


cluster  0 assignments:  73.0
cluster  1 assignments:  43.0
cluster  2 assignments:  141.0
cluster  3 assignments:  257.0
cluster  4 assignments:  26.0
cluster  5 assignments:  86.0
cluster  6 assignments:  11.0
cluster  7 assignments:  13.0
cluster  8 assignments:  39.0
cluster  9 assignments:  17.0
cluster  10 assignments:  62.0
cluster  11 assignments:  48.0
cluster  12 assignments:  13.0
cluster  13 assignments:  95.0
cluster  14 assignments:  37.0
cluster  15 assignments:  189.0
cluster  16 assignments:  99.0
cluster  17 assignments:  23.0
cluster  18 assignments:  21.0
cluster  19 assignments:  130.0

In [15]:
# Articles' categories in model 1 (10 clusters)
clusters_10k_idx = []
for col in resps_10k.column_names():
    cluster_10k = np.array(resps_10k[col])
    cluster_10k = cluster_10k.nonzero()[0]
    clusters_10k_idx.append(cluster_10k)
clusters_report(clusters_10k_idx)


Cluster  0 
==========================

product count :  53 
engineering count :  7 
business count :  299 

Cluster  1 
==========================

product count :  105 
engineering count :  17 
business count :  13 

Cluster  2 
==========================

product count :  3 
engineering count :  131 
business count :  5 

Cluster  3 
==========================

product count :  0 
engineering count :  0 
business count :  11 

Cluster  4 
==========================

product count :  1 
engineering count :  24 
business count :  1 

Cluster  5 
==========================

product count :  0 
engineering count :  27 
business count :  3 

Cluster  6 
==========================

product count :  94 
engineering count :  85 
business count :  186 

Cluster  7 
==========================

product count :  82 
engineering count :  0 
business count :  10 

Cluster  8 
==========================

product count :  182 
engineering count :  1 
business count :  47 

Cluster  9 
==========================

product count :  32 
engineering count :  2 
business count :  3 


In [16]:
# Articles' categories in model 2 (20 clusters)
clusters_20k_idx = []
for col in resps_20k.column_names():
    cluster_20k = np.array(resps_20k[col])
    cluster_20k = cluster_20k.nonzero()[0]
    clusters_20k_idx.append(cluster_20k)
clusters_report(clusters_20k_idx)


Cluster  0 
==========================

product count :  66 
engineering count :  2 
business count :  5 

Cluster  1 
==========================

product count :  18 
engineering count :  24 
business count :  1 

Cluster  2 
==========================

product count :  93 
engineering count :  0 
business count :  48 

Cluster  3 
==========================

product count :  85 
engineering count :  5 
business count :  167 

Cluster  4 
==========================

product count :  1 
engineering count :  24 
business count :  1 

Cluster  5 
==========================

product count :  32 
engineering count :  9 
business count :  45 

Cluster  6 
==========================

product count :  0 
engineering count :  0 
business count :  11 

Cluster  7 
==========================

product count :  1 
engineering count :  12 
business count :  0 

Cluster  8 
==========================

product count :  0 
engineering count :  0 
business count :  39 

Cluster  9 
==========================

product count :  1 
engineering count :  16 
business count :  0 

Cluster  10 
==========================

product count :  53 
engineering count :  1 
business count :  8 

Cluster  11 
==========================

product count :  9 
engineering count :  34 
business count :  5 

Cluster  12 
==========================

product count :  4 
engineering count :  0 
business count :  9 

Cluster  13 
==========================

product count :  1 
engineering count :  0 
business count :  94 

Cluster  14 
==========================

product count :  6 
engineering count :  22 
business count :  9 

Cluster  15 
==========================

product count :  17 
engineering count :  137 
business count :  35 

Cluster  16 
==========================

product count :  98 
engineering count :  0 
business count :  1 

Cluster  17 
==========================

product count :  6 
engineering count :  2 
business count :  15 

Cluster  18 
==========================

product count :  12 
engineering count :  1 
business count :  8 

Cluster  19 
==========================

product count :  49 
engineering count :  5 
business count :  76 

Section 4:

Recommendations and predictions for articles

Recommendation method:

To recommend articles, retrieve the cluster that the query article belongs to, fetch all the articles in that cluster, and pass them to a nearest-neighbours model to find the 10 best recommendations for the query article.

Prediction method:

Given a set of articles, predict the cluster each one belongs to based on the trained model.

  • The test dataset is used to predict a cluster for each article with both trained models.

In [17]:
# Find the cluster that contains a given article; return its id and member indices
def articles_inds(article_id, cluster_hash_model):
    for cluster_id in cluster_hash_model:
        np_array = np.array(cluster_hash_model[cluster_id])
        if article_id in np_array:
            return cluster_id, np_array

In [18]:
def recommender(article_id, cluster_hash_model, no_articles, data_articles):
    start_time = time.time()
    cid, inds = articles_inds(article_id, cluster_hash_model)
    cluster_articles = data_articles.filter_by(inds[0], 'X1')
    cluster_articles = cluster_articles.add_row_number()

    # Re-fit tf-idf on the cluster's articles only
    recom_vec = TfidfVectorizer(stop_words='english')
    tfidf_recommend = recom_vec.fit_transform(cluster_articles['text'])
    tfidf_recommend = normalize(tfidf_recommend)

    # Row of the query article within the cluster, then its nearest neighbours
    row_id = cluster_articles[cluster_articles['X1'] == article_id]['id'][0]
    NN_model = NearestNeighbors(n_neighbors=no_articles).fit(tfidf_recommend)
    distances, indices = NN_model.kneighbors(tfidf_recommend[row_id])

    # Map cluster-local row ids back to dataset-wide 'X1' ids
    recommended_ids = []
    for i in indices[0]:
        recommended_ids.append(cluster_articles[cluster_articles['id'] == i]['X1'][0])

    del cluster_articles
    del tfidf_recommend
    del recom_vec
    #print("--- %s seconds ---" % (time.time() - start_time))
    #print len(inds[0])
    return recommended_ids
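
A minimal usage sketch (the full loop over the whole dataset appears in In [21] below). no_articles is set to 11 because the nearest neighbour of an article is always the article itself, so the first returned id is the query article:

recs = recommender(0, cluster_hash_20model, 11, dataset)
print recs[1:]   # drop the query article, keep the 10 recommendations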

In [19]:
def predict_cluster(articles, em_model):
    article_tfidf = tfidfvec.transform(articles['text'])
    mu = deepcopy(em_model['means'])
    sigma = deepcopy(em_model['covs'])
    assignments = []
    for j in range(article_tfidf.shape[0]):
        resps = []
        for i in range(len(em_model['weights'])):
            # Log-posterior (up to a constant): log weight + diagonal-Gaussian log-density
            predict = np.log(em_model['weights'][i]) + logpdf_diagonal_gaussian(article_tfidf[j], mu[i], sigma[i])
            resps.append(predict)
        assignments.append(int(np.argmax(resps)))   # MAP cluster assignment
    return assignments
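
predict_cluster implements the MAP rule under the trained mixture: each article is assigned to the component with the highest log-posterior (up to an additive constant shared by all components),

$$\hat{z} = \arg\max_{k}\Bigl[\log w_k + \log \mathcal{N}\!\bigl(x \mid \mu_k,\ \mathrm{diag}(\sigma_k^2)\bigr)\Bigr]$$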

In [21]:
# Generate recommendations for every article; they are appended to the SFrame and exported in the next cells.
recommended_inds = []
start_time = time.time()
for i in range(len(dataset)):
    recommended_inds.append(recommender(i, cluster_hash_20model, 11, dataset))

print("--- %s seconds (total recommendation runtime) ---" % (time.time() - start_time))


--- 517.98885417 seconds (total recommendation runtime) ---

In [22]:
rec_inds = sf.SArray(recommended_inds)
dataset.add_column(rec_inds, name='recommendations')


Out[22]:
X1  file_name                                  category     text                                                 recommendations
0   training-dataset/engineering/912.txt ...   engineering  Uber s case for incremental processin ...            [0.0, 334.0, 1289.0, 638.0, 1414.0, 413.0, ...
1   training-dataset/business/747.txt ...      business     On the Road to Recap Why the Unicorn Financing ...   [1.0, 1378.0, 545.0, 398.0, 1238.0, 752.0, ...
2   training-dataset/product/919.txt ...       product      How designers can use data to create amazing ...     [2.0, 740.0, 254.0, 397.0, 1331.0, 1139.0, ...
3   training-dataset/business/222.txt ...      business     The Arc of Company Life and How to Prolong ItOn ...  [3.0, 432.0, 111.0, 1.0, 752.0, 621.0, 1317.0, ...
4   training-dataset/business/238.txt ...      business     Advice to Grads Join A Winning Startup v 2016 ...    [4.0, 890.0, 1086.0, 572.0, 281.0, 707.0, ...
5   training-dataset/product/297.txt ...       product      GV Guide to Design Critique GV LibraryGV ...         [5.0, 1236.0, 207.0, 235.0, 1251.0, 523.0, ...
6   training-dataset/product/1281.txt ...      product      Beating designer s blockThose hours or days ...      [6.0, 1316.0, 25.0, 1280.0, 609.0, 1145.0, ...
7   training-dataset/product/310.txt ...       product      How to create effective push notificationsOver ...   [7.0, 209.0, 113.0, 924.0, 1161.0, 853.0, ...
8   training-dataset/product/160.txt ...       product      Thoughtbot s Kyle Fiedler Know yourself and trust .. [8.0, 862.0, 523.0, 879.0, 1024.0, 1251.0, ...
9   training-dataset/product/92.txt ...        product      A product team s friend or foe Feature Req ...       [9.0, 1116.0, 507.0, 121.0, 692.0, 605.0, ...
[1423 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [23]:
dataset.save('Articles_with_recommendations.csv',format='csv')

In [24]:
# Save each cluster's data in a separate CSV file
for cluster_id in cluster_hash_20model:
    ind = np.array(cluster_hash_20model[cluster_id])
    #print ind
    cluster_articles = dataset.filter_by(ind[0], 'X1')
    cluster_articles.save('Clusters_model20/cluster_' + str(cluster_id) + '.csv', format='csv')
    del cluster_articles

Test data for cluster assignment.


In [25]:
testset = sf.SFrame('Dataset/KO_articles_test.csv')


Finished parsing file /home/abdl-rahman/Desktop/Recommendation systems/EM for clustering/Dataset/KO_articles_test.csv
Parsing completed. Parsed 97 lines in 0.099565 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file /home/abdl-rahman/Desktop/Recommendation systems/EM for clustering/Dataset/KO_articles_test.csv
Parsing completed. Parsed 97 lines in 0.062474 secs.

In [26]:
test_tfidf = tfidfvec.transform(testset['text'])   # note: predict_cluster re-transforms the text itself
# Predict using the model with 10 clusters.
test_predictions = predict_cluster(testset, model_em_10k)
test_predictions = np.array(test_predictions)
test_predictions


Out[26]:
array([0, 6, 0, 0, 6, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 0, 2, 8, 1, 2, 2,
       2, 2, 0, 0, 6, 1, 0, 2, 0, 1, 7, 2, 2, 2, 2, 2, 2, 6, 5, 0, 2, 2, 6,
       2, 1, 2, 2, 2, 2, 0, 0, 6, 1, 0, 2, 0, 1, 7, 2, 2, 2, 2, 2, 2, 6, 5,
       0, 2, 2, 6, 2, 6, 1, 0, 6, 0, 0, 0, 6, 6, 0, 0, 6, 0, 0, 8, 6, 0, 6,
       1, 8, 0, 0, 0])

In [27]:
# Predict using the model with 20 clusters.
test_predictions = predict_cluster(testset, model_em_20k)
test_predictions = np.array(test_predictions)
test_predictions


Out[27]:
array([13, 15,  5, 15, 13,  3, 19,  3,  3,  2,  3,  3, 15,  3, 19,  3, 19,
        3, 15,  2, 15, 11, 11, 15, 15, 13,  3, 14,  1, 15, 15, 15, 15,  2,
       15, 15, 15, 15, 15,  0, 15, 10, 15, 15, 15, 15, 15, 15, 11, 11, 15,
       15, 13,  3, 14,  1, 15, 15, 15, 15,  2, 15, 15, 15, 15, 15,  0, 15,
       10, 15, 15, 15, 15, 15,  3,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        5, 19, 19,  2,  3, 19,  5, 15, 16,  3,  2,  3])

In [ ]: